# Always print this out before your assignment
sessionInfo()
getwd()
# load all your libraries in this chunk
library('tidyverse')
library("fs")
library('here')
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
library('ggthemes')
library('kableExtra')
library('pastecs')
library('viridis')
library('plotly')
library('tidyquant')
library('scales')
library("gdata")
# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk.
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children and number of degrees
dataclean <- data %>%
mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
mutate(Birthdate = mdy(Birthdate)) %>%
mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(ID = as.numeric(ID)) %>%
mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>%
mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
ifelse(is.na(Child.2.ID),1,2)))%>%
mutate(ID = as.numeric(ID)) %>%
mutate( nmb_degree = ifelse(is.na(Degree.Type.1),0,
ifelse(is.na(Degree.Type.2),1,2)))
#conferral dates
dataclean <- dataclean %>%
mutate(Conferral.Date.1 = ifelse(Conferral.Date.1 == "0001-01-01", NA, Conferral.Date.1)) %>%
mutate(Conferral.Date.1 = mdy(Conferral.Date.1)) %>%
mutate(Conferral.Date.1.Age = as.numeric(floor(interval(start= Conferral.Date.1, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Conferral.Date.2 = ifelse(Conferral.Date.2 == "0001-01-01", NA, Conferral.Date.2)) %>%
mutate(Conferral.Date.2 = mdy(Conferral.Date.2)) %>%
mutate(Conferral.Date.2.Age = as.numeric(floor(interval(start= Conferral.Date.2, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Last.Contact.By.Anyone = ifelse(Last.Contact.By.Anyone == "0001-01-01", NA, Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.By.Anyone = mdy(Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.Age = as.numeric(floor(interval(start= Last.Contact.By.Anyone, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(HH.First.Gift.Date = ifelse(HH.First.Gift.Date == "0001-01-01", NA, HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Date = mdy(HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Age = as.numeric(floor(interval(start= HH.First.Gift.Date, end=Sys.Date())/duration(n=1, unit="years"))))
#major gift
dataclean <-
dataclean %>%
mutate(major_gifter = ifelse(Lifetime.Giving > 50000, 1,0) %>% factor(., levels = c("0","1")))
#splitting up the age into ranges and creating category for easy visualization
dataclean <- dataclean %>%
mutate(age_range =
ifelse(Age %in% 10:19, "10 < 20 years old",
ifelse(Age %in% 20:29, "20 < 30 years old",
ifelse(Age %in% 30:39, "30 < 40 years old",
ifelse(Age %in% 40:49, "40 < 50 years old",
ifelse(Age %in% 50:59, "50 < 60 years old",
ifelse(Age %in% 60:69, "60 < 70 years old",
ifelse(Age %in% 70:79, "70 < 80 years old",
ifelse(Age %in% 80:89, "80 < 90 years old",
ifelse(Age %in% 90:120, "90+ years old",
NA))))))))))
#seeing what we have
table(dataclean$age_range)
10 < 20 years old 20 < 30 years old 30 < 40 years old
3985 24558 21037
40 < 50 years old 50 < 60 years old 60 < 70 years old
16851 20755 18257
70 < 80 years old 80 < 90 years old 90+ years old
12246 5984 6633
#50-60 is the most common age range
#creating a region column using the county data and the OMB MSA (Metropolitan Statistical Area) definitions
dataclean <- dataclean %>%
mutate(region =
ifelse(County == "San Luis Obispo" & State == "CA", "So Cal",
ifelse(County == "Kern" & State == "CA", "So Cal",
ifelse(County == "San Bernardino" & State == "CA", "So Cal",
ifelse(County == "Santa Barbara" & State == "CA", "So Cal",
ifelse(County == "Ventura" & State == "CA", "So Cal",
ifelse(County == "Los Angeles" & State == "CA", "So Cal",
ifelse(County == "Orange" & State == "CA", "So Cal",
ifelse(County == "Riverside" & State == "CA", "So Cal",
ifelse(County == "San Diego" & State == "CA", "So Cal",
ifelse(County == "Imperial" & State == "CA", "So Cal",
ifelse(County == "King" & State == "WA", "Seattle",
ifelse(County == "Snohomish" & State == "WA", "Seattle",
ifelse(County == "Pierce" & State == "WA", "Seattle",
ifelse(County == "Clackamas" & State == "OR", "Portland",
ifelse(County == "Columbia" & State == "OR", "Portland",
ifelse(County == "Multnomah" & State == "OR", "Portland",
ifelse(County == "Washington" & State == "OR", "Portland",
ifelse(County == "Yamhill" & State == "OR", "Portland",
ifelse(County == "Clark" & State == "WA", "Portland",
ifelse(County == "Skamania" & State == "WA", "Portland",
ifelse(County == "Denver" & State == "CO", "Denver",
ifelse(County == "Arapahoe" & State == "CO", "Denver",
ifelse(County == "Jefferson" & State == "CO", "Denver",
ifelse(County == "Adams" & State == "CO", "Denver",
ifelse(County == "Douglas" & State == "CO", "Denver",
ifelse(County == "Broomfield" & State == "CO", "Denver",
ifelse(County == "Elbert" & State == "CO", "Denver",
ifelse(County == "Park" & State == "CO", "Denver",
ifelse(County == "Clear Creek" & State == "CO", "Denver",
ifelse(County == "Alameda" & State == "CA", "Bay Area",
ifelse(County == "Contra Costa" & State == "CA", "Bay Area",
ifelse(County == "Marin" & State == "CA", "Bay Area",
ifelse(County == "Monterey" & State == "CA", "Bay Area",
ifelse(County == "Napa" & State == "CA", "Bay Area",
ifelse(County == "San Benito" & State == "CA", "Bay Area",
ifelse(County == "San Francisco" & State == "CA", "Bay Area",
ifelse(County == "San Mateo" & State == "CA", "Bay Area",
ifelse(County == "Santa Clara" & State == "CA", "Bay Area",
ifelse(County == "Santa Cruz" & State == "CA", "Bay Area",
ifelse(County == "Solano" & State == "CA", "Bay Area",
ifelse(County == "Sonoma" & State == "CA", "Bay Area",
NA))))))))))))))))))))))))))))))))))))))))))
dataclean <- dataclean %>%
mutate(region =
ifelse(County == "Kings" & State == "NY", "New York",
ifelse(County == "Queens" & State == "NY", "New York",
ifelse(County == "New York" & State == "NY", "New York",
ifelse(County == "Bronx" & State == "NY", "New York",
ifelse(County == "Richmond" & State == "NY", "New York",
ifelse(County == "Westchester" & State == "NY", "New York",
ifelse(County == "Bergen" & State == "NY", "New York",
ifelse(County == "Hudson" & State == "NY", "New York",
ifelse(County == "Passaic" & State == "NY", "New York",
ifelse(County == "Putnam" & State == "NY", "New York",
ifelse(County == "Rockland" & State == "NY", "New York",
ifelse(County == "Suffolk" & State == "NY", "New York",
ifelse(County == "Nassau" & State == "NY", "New York",
ifelse(County == "Middlesex" & State == "NJ", "New York",
ifelse(County == "Monmouth" & State == "NJ", "New York",
ifelse(County == "Ocean" & State == "NJ", "New York",
ifelse(County == "Somerset" & State == "NJ", "New York",
ifelse(County == "Essex" & State == "NJ", "New York",
ifelse(County == "Union" & State == "NJ", "New York",
ifelse(County == "Morris" & State == "NJ", "New York",
ifelse(County == "Sussex" & State == "NJ", "New York",
ifelse(County == "Hunterdon" & State == "NJ", "New York",
ifelse(County == "Pike" & State == "NJ", "New York",
region))))))))))))))))))))))))
# code nor cal region as all others in CA not already defined
dataclean <- dataclean %>%
mutate(region =
ifelse(State == "CA" & is.na(region) == TRUE, "Nor Cal", region))
#Removing Columns that provide no benefit
dataclean <- subset(dataclean,select = -c(Assignment.Number
,Assignment.has.Historical.Mngr
,Suffix
,Assignment.Date
,Assignment.Manager
,Assignment.Role
,Assignment.Title
,Assignment.Status
,Strategy
,Progress.Level
,Assignment.Group
,Assignment.Category
,Funding.Method
,Expected.Book.Date
,Qualification.Amount
,Expected.Book.Amount
,Expected.Book.Date
,Hard.Gift.Total
,Soft.Credit.Total
,Total.Assignment.Gifts
,No.of.Pledges
,Proposal..
,Proposal.Notes
,HH.Life.Spouse.Credit
,Last.Contact.By.Manager
,X..of.Contacts.By.Manager
,DonorSearch.Range
,iWave.Range
,WealthEngine.Range
,Philanthropic.Commitments
))
#cleaning up zip codes removing -4 after
dataclean$Zip <- gsub(dataclean$Zip, pattern="-.*", replacement = "")
#adding zip code data and column
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding zip salary column
dataclean <-dataclean %>%
mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E))
#slry range
dataclean <- dataclean %>%
mutate(zipslry_range =
ifelse(zipcode_slry %in% 10000:89999, "90K-99K",
ifelse(zipcode_slry %in% 90000:99999, "90K-99K",
ifelse(zipcode_slry %in% 100000:149999, "100K-149K",
ifelse(zipcode_slry %in% 150000:199999, "150K-199K",
ifelse(zipcode_slry %in% 200000:249999, "200K-249K",
ifelse(zipcode_slry %in% 250000:299999, "250K-299K",
ifelse(zipcode_slry %in% 300000:349999, "300K-349K",
ifelse(zipcode_slry %in% 350000:399999, "350K-399K",
ifelse(zipcode_slry %in% 400000:499999, "400K-499K",
ifelse(zipcode_slry %in% 500000:999999, "500K-999K",
NA)))))))))))
sum(is.na(dataclean$zipcode_slry))
[1] 62347
#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding scholarship column
dataclean <-dataclean %>%
mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP))
#replacing NA with 0
dataclean$scholarship <- replace_na(dataclean$scholarship,'0')
#replacing Y with 1
dataclean$scholarship<-ifelse(dataclean$scholarship=="Y",1,0)
#checking how many are N
table(dataclean$scholarship)
0 1
295264 27962
#checking and deleting scholarship column
class(dataclean$schlr_fct)
[1] "NULL"
dataclean = subset(dataclean, select = -c(scholarship))
#checking for duplicates N >1 indicates a records values are in the file twice
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
#removing duplicated records
dataclean <- unique(dataclean)
#n = 1 no ID with multiple records cleaned of dupes
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
NA
1d Creating many many factor variables
dataclean <-
dataclean %>%
#SEX
mutate(sex_fct =
fct_explicit_na(Sex),
sex_simple =
fct_lump_n(Sex, n = 4),
#MARRIED
married_fct =
fct_explicit_na(Married),
#DONOR SEGMENT
donorseg_fct =
fct_explicit_na(Donor.Segment),
donorseg_simple =
fct_lump_n(Donor.Segment, n = 4),
#CONTACT RULE
contact_fct =
fct_explicit_na(Contact.Rules),
contact_simple =
fct_lump_n(Contact.Rules, n = 4),
#SPOUSE MAIL
spomail_fct =
fct_explicit_na(Spouse.Mail.Rules),
spomail_simple =
fct_lump_n(Spouse.Mail.Rules, n = 4),
#JOB TITLE
jobtitle_fct =
fct_explicit_na(Job.Title),
jobtitle_simple =
fct_lump_n(Job.Title, n = 5),
#DEGREE TYPE 1
deg1_fct =
fct_explicit_na(Degree.Type.1),
deg1_simple =
fct_lump_n(Degree.Type.1, n = 5),
#DEGREE TYPE 2
deg2_fct =
fct_explicit_na(Degree.Type.2),
deg2_simple =
fct_lump_n(Degree.Type.2, n = 5),
#MAJOR 1
maj1_fct =
fct_explicit_na(Major.1),
maj1_simple =
fct_lump_n(Major.1, n = 5),
#MAJOR 2
maj2_fct =
fct_explicit_na(Major.2),
maj2_simple =
fct_lump_n(Major.2, n = 5),
#MINOR 1
min1_fct =
fct_explicit_na(Minor.1),
min1_simple =
fct_lump_n(Minor.1, n = 5),
#MINOR 2
min2_fct =
fct_explicit_na(Minor.2),
min2_simple =
fct_lump_n(Minor.2, n = 5),
#SCHOOL 1
school1_fct =
fct_explicit_na(School.1),
school1_simple =
fct_lump_n(School.1, n = 5),
#SCHOOL 2
school2_fct =
fct_explicit_na(School.2),
school2_simple =
fct_lump_n(School.2, n = 5),
#INSTITUTION TYPE
insttype_fct =
fct_explicit_na(Institution.Type),
insttype_simple =
fct_lump_n(Institution.Type, n = 5),
#EXTRACURRICULAR
extra_fct =
fct_explicit_na(Extracurricular),
extra_simple =
fct_lump_n(Extracurricular, n = 5),
#HH FIRST GIFT FUND
hhfirstgift_fct =
fct_explicit_na(HH.First.Gift.Fund),
hhfirstgift_simple =
fct_lump_n(HH.First.Gift.Fund, n = 5),
#CHILD 1 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.1.Enroll.Status),
ch1_enroll_simple =
fct_lump_n(Child.1.Enroll.Status, n = 4),
#CHILD 1 MAJOR
ch1_maj_fct =
fct_explicit_na(Child.1.Major),
ch1_maj_simple =
fct_lump_n(Child.1.Major, n = 4),
#CHILD 1 MINOR
ch1_min_fct =
fct_explicit_na(Child.1.Minor),
ch1_min_simple =
fct_lump_n(Child.1.Minor, n = 4),
#CHILD 1 SCHOOL
ch1_school_fct =
fct_explicit_na(Child.1.School),
ch1_school_simple =
fct_lump_n(Child.1.School, n = 4),
#CHILD 1 FEEDER
ch1_feeder_fct =
fct_explicit_na(Child.1.Feeder.School),
ch1_feeder_simple =
fct_lump_n(Child.1.Feeder.School, n = 4),
#CHILD 2 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.2.Enroll.Status),
ch2_enroll_simple =
fct_lump_n(Child.2.Enroll.Status, n = 4),
#CHILD 2 MAJOR
ch2_maj_fct =
fct_explicit_na(Child.2.Major),
ch2_maj_simple =
fct_lump_n(Child.2.Major, n = 4),
#CHILD 2 MINOR
ch2_min_fct =
fct_explicit_na(Child.2.Minor),
ch2_min_simple =
fct_lump_n(Child.2.Minor, n = 4),
#CHILD 2 SCHOOL
ch2_school_fct =
fct_explicit_na(Child.2.School),
ch2_school_simple =
fct_lump_n(Child.2.School, n = 4),
#CHILD 2 FEEDER
ch2_feeder_fct =
fct_explicit_na(Child.2.Feeder.School),
ch2_feeder_simple =
fct_lump_n(Child.2.Feeder.School, n = 4),
)
#checking to see if its a factor
#class(dataclean$sex_fct)
#class(dataclean$donorseg_fct)
#class(dataclean$contact_fct)
#class(dataclean$spomail_fct)
#checking levels
#levels(dataclean$sex_simple)
#levels(dataclean$donorseg_simple)
#levels(dataclean$contact_simple)
#levels(dataclean$spomail_simple)
#levels(dataclean$hhfirstgift_simple)
#creating a table against Sex column
#table(dataclean$sex_fct, dataclean$sex_simple)
Region Analysis
#grouping by region and analyzing
dataclean %>%
group_by(region) %>%
summarise(Count = length(region),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
mutate(mean_total_giv = dollar(mean_total_giv)) %>%
kable(col.names = c("Region", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Region | Count | Mean HH Lifetime Giving |
|---|---|---|
| So Cal | 145139 | $5,090.84 |
| NA | 130306 | $2,040.98 |
| Bay Area | 20641 | $755.92 |
| Nor Cal | 10707 | $3,823.63 |
| Seattle | 5425 | $922.08 |
| New York | 4959 | $1,978.49 |
| Portland | 2976 | $1,098.24 |
| Denver | 2847 | $257.29 |
NA
NA
DonorSegment Analysis
#grouping by donorsegment and analyzing
dataclean %>%
group_by(Donor.Segment) %>%
summarise(Count = length(Donor.Segment),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
#added scales package to have the values show in dollar
mutate(mean_total_giv = (dollar(mean_total_giv))) %>%
kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Donor Segment | Count | Mean HH Lifetime Giving |
|---|---|---|
| NA | 231974 | $0.00 |
| Lost Donor | 69718 | $4,954.47 |
| Lapsed Donor | 11193 | $10,069.79 |
| Current Donor | 5603 | $90,638.32 |
| Lapsing Donor | 3862 | $16,590.15 |
| At-Risk Donor | 650 | $77,143.93 |
NA
NA
First gift size
aq <- quantile(dataclean$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
aq <- as.data.frame(aq)
aq$aq <- dollar(aq$aq)
aq %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Quantile | |
|---|---|
| 25% | $0.00 |
| 50% | $0.00 |
| 75% | $0.00 |
| 90% | $40.00 |
| 99% | $1,910.06 |
NA
NA
Consecutive giving
#consecutive years of giving
dataclean %>%
filter(Max.Consec.Fiscal.Years > 0) %>%
ggplot(aes(Max.Consec.Fiscal.Years)) + geom_histogram(fill = "#002845", bins = 20) +
theme_economist_white() +
ggtitle("Consecutive Years of Giving Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,2)) +
scale_y_continuous(breaks = seq(0,10000000,5000))
Lifetime giving based on number of children
dataclean %>%
filter(HH.Lifetime.Giving <= 10000) %>%
filter(HH.Lifetime.Giving > 0) %>%
mutate(`No_of_Children` = as.factor(`No_of_Children`)) %>%
ggplot(aes(HH.Lifetime.Giving, fill = `No_of_Children`)) + geom_histogram(bins = 30) + theme_economist_white() +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,100000,1000)) +
scale_y_continuous(breaks = seq(0,100000000,5000)) +
ggtitle("Giving distribution and number of children")+
scale_fill_manual(values=c("#002845", "#00cfcc", "#ff9973"))
Mean, Median, and Count of Giving in Age Ranges
age_range_giving <- dataclean %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
glimpse(age_range_giving)
2a) Plotting average giving by age range
age_range_giving <-
age_range_giving %>%
mutate(age_range = factor(age_range))
ggplot(age_range_giving, aes(age_range, avg_giving)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_text(angle=45,
hjust=1))
2b) Count of donors based on age range (another way to look at it)
ggplot(dataclean,
aes(age_range)) +
geom_bar() +
theme(axis.text.x = element_text(angle=45,
hjust=1)) +
labs(title = "Count of Age Ranges", x = "", y = "")
2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
ggplot(dataclean, aes(age_range,HH.Lifetime.Giving,fill = age_range)) +
geom_boxplot(
outlier.colour = "red") +
scale_y_log10() +
theme(axis.text.x=element_text(angle=45,hjust=1))
2d) Splitting by age and gender
#creating boxplots
dataclean %>%
filter(Age < 100) %>% #removing the weird outliers that are over 100
filter(Sex %in% c("M", "F")) %>%
ggplot(aes(Sex, Age)) +
geom_boxplot() +
theme_economist() +
ggtitle("Ages of Donors Based on Gender") +
xlab(NULL) + ylab(NULL)
Giving by gender
#remove NAs U X
dataclean2 <- dataclean %>%
filter(Sex %in% c("M", "F"))
q <- ggplot(dataclean2)
q + stat_summary_bin(
aes(y = HH.Lifetime.Giving, x = Sex),
fun.y = "mean", geom = "bar")
summary(dataclean$sex_simple)
Mean age by gender
#breakdown of sexs
tally(group_by(dataclean, Sex))
summarize(group_by(dataclean, Sex),
avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
avg_age = mean(Age, na.rm = TRUE),
med_age = median(Age, na.rm = TRUE))
#grouping by sex and age range for slides
tally(group_by(dataclean, Sex, age_range))
2e) Distribution of people in the states that they live.
dataclean %>%
mutate(State = ifelse(State == " ", "NA", State)) %>%
filter(State != "NA") %>%
group_by(State) %>%
summarise(Count = length(State)) %>%
filter(Count > 800) %>%
arrange(-Count) %>%
kable(col.names = c("Donor's State", "Count")) %>%
kable_styling(bootstrap_options = c("condensed"),
full_width = F)
2f) Looking at all donors first gift amount. 75% made a first gift of <100.
no_non_donors <- dataclean %>%
filter(Lifetime.Giving != 0)
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
nd <- as.data.frame(nd)
nd %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
Split data
#converting married Y and N to 1 and 0
dataclean <- dataclean %>%
mutate(Married_simple = ifelse(Married == "N",0,1))
dataclean <- dataclean %>%
mutate(hh.lifetime.giving_fct = as.factor(HH.Lifetime.Giving)) %>%
mutate(HH.Lifetime.Giving.Plus = log(HH.Lifetime.Giving + 1))
library("rsample")
data_split <- initial_split(dataclean, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
p <- dataclean %>%
ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
ggplotly(p)
p
ggplot(data = dataclean, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
Another Histogram
dataclean %>%
filter(Age >= 10) %>%
filter(Age <= 90) %>%
ggplot(aes(Age)) + geom_histogram(fill = "#002845", bins = 20) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,10000000,2000))
Age distribution by gender
#Age Gender filtered out below 15 and above 90 - also removed U X the weird values
dataclean %>%
filter(Age >= 15) %>%
filter(Age <= 90) %>%
mutate(Sex = as.factor(Sex)) %>%
filter(Sex != "U") %>%
filter(Sex != "X") %>%
ggplot(aes(Age, fill = Sex)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Age Distribution by Gender") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,10)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Donor age distribution by marital status
#Age Marital Status
dataclean %>%
filter(Age >= 20) %>%
filter(Age <= 85) %>%
ggplot(aes(Age, fill = Married)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution by Marital Status") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Linear Model
#These will focus on predicting whether a constituent is a donor or non-donor.
mod1lm <- lm( Lifetime.Giving ~ Married_simple,
data = data_train)
mod2lm <- lm( Total.Giving.Years ~ Lifetime.Giving,
data = data_train)
mod3lm <- lm( Lifetime.Giving ~ region,
data = data_train)
summary(mod1lm)
Call:
lm(formula = Lifetime.Giving ~ Married_simple, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-2867 -2742 -2661 -2661 18111464
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2660.9 251.3 10.588 <0.0000000000000002 ***
Married_simple 205.9 469.1 0.439 0.661
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 104400 on 242248 degrees of freedom
Multiple R-squared: 7.953e-07, Adjusted R-squared: -3.333e-06
F-statistic: 0.1927 on 1 and 242248 DF, p-value: 0.6607
summary(mod2lm)
Call:
lm(formula = Total.Giving.Years ~ Lifetime.Giving, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-36.600 -0.554 -0.554 -0.554 39.403
Coefficients:
Estimate Std. Error t value
(Intercept) 0.55445026328 0.00396511550 139.83
Lifetime.Giving 0.00000343205 0.00000003795 90.43
Pr(>|t|)
(Intercept) <0.0000000000000002 ***
Lifetime.Giving <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.951 on 242248 degrees of freedom
Multiple R-squared: 0.03266, Adjusted R-squared: 0.03265
F-statistic: 8178 on 1 and 242248 DF, p-value: < 0.00000000000000022
summary(mod3lm)
Call:
lm(formula = Lifetime.Giving ~ region, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-3977 -3968 -3968 -3598 18110156
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 513.0 950.9 0.539 0.589558
regionDenver -367.7 2739.3 -0.134 0.893220
regionNew York 1954.2 2160.7 0.904 0.365769
regionNor Cal 3464.0 1623.1 2.134 0.032826 *
regionPortland 161.0 2680.2 0.060 0.952111
regionSeattle -128.2 2088.2 -0.061 0.951057
regionSo Cal 3455.5 1016.1 3.401 0.000672 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 118200 on 144684 degrees of freedom
(97559 observations deleted due to missingness)
Multiple R-squared: 0.0001214, Adjusted R-squared: 7.989e-05
F-statistic: 2.927 on 6 and 144684 DF, p-value: 0.007435
#increasing the giving year one year increase total giving by 0.0035
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~region) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Region")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~nmb_degree) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Number of Degrees")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.First.Gift.Amount))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~donorseg_fct) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Donor Segment")
`geom_smooth()` using formula 'y ~ x'
#This plot actually has some interesting results
ggplot(data = data_train, aes(x = Age, y = log(Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~No_of_Children) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("# Children")
`geom_smooth()` using formula 'y ~ x'
data_train %>%
select_if(is.factor) %>%
glimpse()
Rows: 242,250
Columns: 54
$ major_gifter <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ sex_fct <fct> F, M, F, M, (Missing), M, M, (Missing…
$ sex_simple <fct> F, M, F, M, NA, M, M, NA, M, NA, M, M…
$ married_fct <fct> Y, Y, N, N, N, N, N, N, N, N, N, N, N…
$ donorseg_fct <fct> Lost Donor, (Missing), (Missing), (Mi…
$ donorseg_simple <fct> Lost Donor, NA, NA, NA, NA, Lost Dono…
$ contact_fct <fct> No Solicitations, (Missing), (Missing…
$ contact_simple <fct> No Solicitations, NA, NA, NA, NA, No …
$ spomail_fct <fct> No Solicitations, (Missing), (Missing…
$ spomail_simple <fct> No Solicitations, NA, NA, NA, NA, NA,…
$ jobtitle_fct <fct> (Missing), Manager, (Missing), Public…
$ jobtitle_simple <fct> NA, Other, NA, Other, NA, NA, NA, NA,…
$ deg1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ deg1_simple <fct> NA, NA, NA, NA, NA, Bachelor of Arts,…
$ deg2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ deg2_simple <fct> NA, NA, NA, NA, NA, Master of Arts, N…
$ maj1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ maj1_simple <fct> NA, NA, NA, NA, NA, Other, Law (Full-…
$ maj2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ maj2_simple <fct> NA, NA, NA, NA, NA, Other, NA, NA, NA…
$ min1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ min1_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ min2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ min2_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ school1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ school1_simple <fct> NA, NA, NA, NA, NA, NA, Other, NA, NA…
$ school2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ school2_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ insttype_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ insttype_simple <fct> NA, NA, NA, NA, NA, NA, Law JD Full-T…
$ extra_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ extra_simple <fct> NA, NA, NA, NA, NA, Other, NA, NA, NA…
$ hhfirstgift_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ hhfirstgift_simple <fct> NA, NA, NA, NA, NA, Pre-SRN Conversio…
$ ch1_enroll_fct <fct> (Missing), Program Completed, (Missin…
$ ch1_enroll_simple <fct> NA, NA, NA, NA, NA, Program Completed…
$ ch1_maj_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_maj_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch1_min_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_min_simple <fct> NA, NA, NA, NA, NA, Non-Degree: GR Ta…
$ ch1_school_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_school_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch1_feeder_fct <fct> (Missing), Palm Beach State College, …
$ ch1_feeder_simple <fct> NA, Other, NA, NA, NA, NA, NA, NA, NA…
$ ch2_enroll_simple <fct> NA, Program Completed, NA, NA, NA, NA…
$ ch2_maj_fct <fct> (Missing), Business Administration BS…
$ ch2_maj_simple <fct> NA, Business Administration BS, NA, N…
$ ch2_min_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch2_min_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch2_school_fct <fct> (Missing), George L. Argyros School o…
$ ch2_school_simple <fct> NA, George L. Argyros School of Busin…
$ ch2_feeder_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch2_feeder_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ hh.lifetime.giving_fct <fct> 25, 0, 0, 0, 0, 8048.75, 0, 0, 0, 0, …
MORE MODELS
Big logistic model
print(calc_auc(p_train)$AUC)
[1] 0.8930325
print(calc_auc(p_test)$AUC)
[1] 0.8773735
RIDGE
library('glmnet')
library('glmnetUtils')
ridge_fit1 <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + donorseg_fct + No_of_Children,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit1)
Call:
cv.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ sex_fct +
donorseg_fct + No_of_Children, data = data_train, alpha = 0)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Number of crossvalidation folds: 10
Alpha: 0
Deviance-minimizing lambda: 0.2202558 (+1 SE): 0.2652989
print(ridge_fit1$lambda.min)
[1] 0.2202558
print(ridge_fit1$lambda.1se)
[1] 0.2652989
plot(ridge_fit1)
LASSO
coef(lasso_fit)
37 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 4.54735146
sex_fctF -0.07604783
sex_fctM .
sex_fctU .
sex_fctX .
sex_fct(Missing) .
jobtitle_simpleAttorney .
jobtitle_simpleOwner .
jobtitle_simplePresident 0.20429193
jobtitle_simpleTeacher .
jobtitle_simpleUnknown Position .
jobtitle_simpleOther .
nmb_degree .
school1_simpleCollege of Health and Behavioral Sciences .
school1_simpleDonna Ford Attallah College of Educational Studies .
school1_simpleGeorge L. Argyros School of Business and Economics 0.09311429
school1_simpleLawrence and Kristina Dodge Coll of Film & Media -0.36459784
school1_simpleWilkinson Coll of Arts Humanities & Soc Sciences .
school1_simpleOther .
hhfirstgift_simpleChapman Annual Scholarship Fund .
hhfirstgift_simpleChapman Fund -0.65373923
hhfirstgift_simpleJog-A-Thon .
hhfirstgift_simplePhonathon .
hhfirstgift_simplePre-SRN Conversion Gift History 0.42389757
hhfirstgift_simpleOther .
maj1_simpleBusiness Administration BS .
maj1_simpleEducation .
maj1_simpleLaw (Full-Time) .
maj1_simpleUndecided - UG .
maj1_simpleUnknown Major .
maj1_simpleOther .
donorseg_simpleAt-Risk Donor 0.90437087
donorseg_simpleCurrent Donor 2.22091067
donorseg_simpleLapsed Donor .
donorseg_simpleLapsing Donor 0.79860122
donorseg_simpleLost Donor -0.20881610
No_of_Children 0.76764765
#Default setting is lambda.1se
#From the book - showing convergence with lambda values
plot(lasso_fit$glmnet.fit, xvar="lambda")
enet_mod <- cva.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + jobtitle_simple + nmb_degree + school1_simple + hhfirstgift_simple + maj1_simple + donorseg_simple + No_of_Children + Married,
data = data_train,
alpha = seq(0,1, by = 0.1))
print(enet_mod)
Call:
cva.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ sex_fct +
jobtitle_simple + nmb_degree + school1_simple + hhfirstgift_simple +
maj1_simple + donorseg_simple + No_of_Children + Married,
data = data_train, alpha = seq(0, 1, by = 0.1))
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
Number of crossvalidation folds for lambda: 10
plot(enet_mod)
NA
NA
ELASTICNET
minlossplot(enet_mod,
cv.type = "min")
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
get_model_params <- function(fit) {
alpha <- fit$alpha
lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
best <- which.min(error)
data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
lambdaSE = lambdaSE[best], eror = error[best])
}
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
[1] 0
get_model_params(enet_mod)
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
print(best_mod)
Call: glmnet::cv.glmnet(x = x, y = y, weights = ..1, offset = ..2, nfolds = nfolds, foldid = foldid, alpha = a)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 0.0814 100 2.596 0.08240 32
1se 1.0041 73 2.669 0.08313 32
Ridges plot - could be useful for plotting donations vs donor segment
ggplot(data_train, aes(x = HH.Lifetime.Giving, y = region)) + geom_density_ridges(rel_min_height = 0.005) + xlim(c(25000, 100000)) +
ggtitle("HH Lifetime Giving by Donor Segment")
Picking joint bandwidth of 8480
library('corrplot')
#removing ID zip and nonnumeric
corrplot_data <- dataclean[-c(1:48,52:56,58:60,63,66:67,70:72,74:81,83:132)]
#Convert from character to numeric data type
convert_fac2num <- function(x){
as.numeric(as.factor(x))
}
corrplot_data <- mutate_at(corrplot_data,
.vars = c(1:12),
.funs = convert_fac2num)
#making a matrix
cd_cor <- cor(corrplot_data)
#creating correlation
col <- colorRampPalette(c("#BB4400", "#EE9990",
"#FFFFFF", "#77AAEE", "#4477BB"))
corrplot(cd_cor, method="color", col=col(100),
type="lower", addCoef.col = "black",
tl.pos="lt", tl.col="black",
tl.cex=0.7, tl.srt=45,
number.cex=0.7,
diag=FALSE)
#correlation matrix
# pairs(~Age + Months.Since.Last.Gift + donorseg_fct +
# nmb_degree + No_of_Children + HH.First.Gift.Age + HH.First.Gift.Amount + Total.Giving.Years,
# col = corrplot_data$HH.Lifetime.Giving,
# data = corrplot_data,
# main = "Donor Scatter Plot Matrix")
#worthless..
ggplot(data = corrplot_data, aes(x = nmb_degree, y = HH.Lifetime.Giving)) +
geom_point(aplha = 1/10)+
geom_smooth(method = "lm", color ="red")
Random Forest
library('randomForest')
rf_fit_donor <- randomForest(Lifetime.Giving ~ .,
data = data_train,
type = classification,
mtry = 7,
na.action = na.roughfix,
ntree = 200,
importance=TRUE
)
print(rf_fit_donor)
varImpPlot(rf_fit_donor, sort = TRUE,
n.var = 5,
type = 2, class = NULL, scale = TRUE,
main = deparse(substitute(rf_fit_donor)))
library('randomForestExplainer')
plot_min_depth_distribution(
rf_fit_donor,
k = 10,
min_no_of_trees = 0,
mean_sample = "top_trees",
mean_scale = FALSE,
mean_round = 2,
main = "Distribution of minimal depth and its mean"
)
#Splitting Category out to check if the category is useful for analysis
data_category_split_out <- dataclean %>%
mutate(Category.Codes = trim(strsplit(as.character(Category.Codes), "|", fixed = TRUE))) %>%
unnest(Category.Codes) %>% pivot_wider(names_from = Category.Codes,values_from =Category.Codes, values_fn = length)